🧐 Analyser av stortingsdata 🏰
Universitetet i Oslo
<martin.soyland@stv.uio.no>
2023-09-28
Gilardi and Wüest (2018)
#' Retrieve the speech activity of one MP in one parliamentary session
#'
#' Requests the `representanttaleaktiviteter` export from data.stortinget.no
#' and flattens the XML response into a data frame.
#'
#' @param sessionid Id of the session, sent as the `sesjonid` query parameter.
#' @param mp_id Id of the MP, sent as the `personid` query parameter.
#' @param good_manners Number of seconds to pause (via `Sys.sleep()`) after the
#'   response has been parsed. Defaults to 0 (no pause).
#' @return A data frame with one row per speech activity in the response and
#'   the columns `response_date`, `version`, `session_id`,
#'   `agenda_case_number`, `meeting_id`, `speech_start_time`, `speech_type`
#'   and `speech_length_secs`, all as text extracted with `html_text()`.
#'   The data frame has zero rows when the response holds no activities.
get_session_mp_speech_activity <- function(sessionid = NA, mp_id = NA, good_manners = 0){

  # Without both ids the request URL would contain the literal text "NA"
  if (anyNA(sessionid) || anyNA(mp_id)) {
    stop("Both 'sessionid' and 'mp_id' must be specified.", call. = FALSE)
  }

  url <- paste0(
    "https://data.stortinget.no/eksport/representanttaleaktiviteter?personid=",
    mp_id, "&sesjonid=", sessionid)

  base <- GET(url)

  # The HTTP status is checked first, so that a failed request is reported as
  # a failure and not as an unexpected content type
  status <- http_status(base)
  if (status$category != "Success") stop(paste0("Response of ", url, " returned as '", status$message, "'"), call. = FALSE)

  resp <- http_type(base)
  if (resp != "text/xml") stop(paste0("Response of ", url, " is not text/xml."), call. = FALSE)

  tmp <- read_html(base)

  # Text content of every node matching a CSS selector
  node_text <- function(css) tmp |> html_elements(css) |> html_text()

  # Fields that are repeated for each speech activity
  activity <- data.frame(
    agenda_case_number = node_text("representant_tale_aktivitet > dagsorden_sak_nummer"),
    meeting_id = node_text("representant_tale_aktivitet > mote_id"),
    speech_start_time = node_text("representant_tale_aktivitet > tale_start_tid"),
    speech_type = node_text("representant_tale_aktivitet > tale_type"),
    speech_length_secs = node_text("representant_tale_aktivitet > tale_varighet_sekunder"))

  # The overview fields are expected once per response and are repeated to the
  # number of activities. Using rep() rather than data.frame() recycling means
  # a response without activities gives an empty data frame instead of a
  # "differing number of rows" error.
  n_activities <- nrow(activity)
  overview <- data.frame(
    response_date = rep(node_text("representant_tale_aktivitet_oversikt > respons_dato_tid"),
                        length.out = n_activities),
    version = rep(node_text("representant_tale_aktivitet_oversikt > versjon"),
                  length.out = n_activities),
    session_id = rep(node_text("representant_tale_aktivitet_oversikt > sesjon_id"),
                     length.out = n_activities))

  tmp2 <- cbind(overview, activity)

  Sys.sleep(good_manners)

  return(tmp2)
}
| Møte | Sak nr. | Start | Tid | Type |
|---|---|---|---|---|
| 11098 | 1 | 2022-10-05T15:22:31.67 | 3min. 0sek. | 3MIN |
| 11110 | 1 | 2022-10-25T11:21:08.367 | 5min. 15sek. | INNL |
| 11128 | 1 | 2022-12-01T15:20:52.967 | 4min. 38sek. | INNL |
| 11117 | 1 | 2022-11-16T10:18:37.93 | 1min. 55sek. | MSPT_SVAR |
| 11197 | 1 | 2023-05-10T10:04:01.513 | 2min. 0sek. | MSPT_SVAR |
| 11109 | 1 | 2022-10-26T11:04:44.1 | 1min. 3sek. | MSPT_TILSVA |
| 11109 | 1 | 2022-10-26T11:25:32.657 | 15sek. | MSPT_TILSVA |
| 11110 | 1 | 2022-10-25T10:03:29.447 | 30min. 12sek. | REDGJ |
| 11154 | 1 | 2023-02-02T10:07:40.013 | 36min. 39sek. | REDGJ |
| 11098 | 1 | 2022-10-05T10:42:25.26 | 1min. 5sek. | REPLSV |
| 11128 | 1 | 2022-12-01T15:35:38.28 | 59sek. | REPLSV |
Talk of Norway
Lapponi et al. (2018)
Oslo-Bergen tagger
Domenetap, at norsk blir erstattet av engelsk innenfor et spesielt område, er ofte sett på som en av de store truslene mot den norske språkutviklingen. De klassiske eksemplene på dette er transnasjonale selskaper som tar i bruk engelsk på arbeidsplassen, og innenfor utdanningssektoren hvor f.eks. høyere utdanning foregår på engelsk.
SP
SV
Spørsmål i Stortinget
Bjørkholt and Søyland (2022)
[W]hy would a party […] invest in oversight activities? The reason […] is that an opposition party will pursue electoral and office goals, and these goals can be achieved through engaging in executive oversight.
Whitaker and Martin (2021)
Barnes et al. (2019) NorSentLex
Representasjon i spørsmål
Søyland (2022)
Eirin Sund (Arbeiderpartiet, 2014-05-20):
Vi snakker her om foreldre, altså mødre og fedre, som ønsker å delta i sine barns aktiviteter, som f.eks. en fotballturnering. Du kan ikke komme deg fra Ålgård til Kverneland, det går ikke buss.
Warning message: In spacy_parse.character(text, pos = TRUE, tag = FALSE, lemma = TRUE, : lemmatization may not work properly in model ‘nb_core_news_lg’
| | doc_id | sentence_id | token_id | token | lemma | pos | entity |
|---|---|---|---|---|---|---|---|
| 26 | text1 | 2 | 1 | Du | du | PRON | |
| 27 | text1 | 2 | 2 | kan | kan | AUX | |
| 28 | text1 | 2 | 3 | ikke | ikke | PART | |
| 29 | text1 | 2 | 4 | komme | komme | VERB | |
| 30 | text1 | 2 | 5 | deg | deg | PRON | |
| 31 | text1 | 2 | 6 | fra | fra | ADP | |
| 32 | text1 | 2 | 7 | Ålgård | Ålgård | PROPN | LOC_B |
| 33 | text1 | 2 | 8 | til | til | ADP | |
| 34 | text1 | 2 | 9 | Kverneland | Kverneland | PROPN | LOC_B |
| 35 | text1 | 2 | 10 | , | , | PUNCT | |
| 36 | text1 | 2 | 11 | det | det | PRON | |
| 37 | text1 | 2 | 12 | går | gåre | VERB | |
| 38 | text1 | 2 | 13 | ikke | ikke | PART | |
| 39 | text1 | 2 | 14 | buss | buss | NOUN | |
| 40 | text1 | 2 | 15 | . | . | PUNCT | |
Eksterne sjokk
Finseraas, Høyland, and Søyland (2021)
# Loading packages
library(readr) # Read data through C++
library(dplyr) # Parse manipulation of data through C++
library(quanteda) # Manipulate text data
library(tidytext) # Structure text through C++
library(tonR) # Some functions for reading Talk of Norway CoNLL-data
library(pbmcapply) # Parallelization with progress bar
library(stringr)
# Reading the main frame of ToN
meta <- read_csv("../../../gitDebates/talk-of-norway/data/ton_updated.csv", progress = TRUE)

# Extracting committee membership: com_member, com_date and com_role each hold
# a " ; "-separated list per speech, which is split into one row per membership.
# seq_len() keeps the loop empty when meta has no rows, and the core count is
# floored at 1 because detectCores() - 1 is 0 on a single-core machine (and NA
# when the number of cores is unknown).
com <- pbmclapply(seq_len(nrow(meta)), function(x){
  data.frame(com_member = unlist(strsplit(meta$com_member[x], " ; ")),
             com_date = unlist(strsplit(meta$com_date[x], " ; ")),
             com_role = unlist(strsplit(meta$com_role[x], " ; ")),
             date = meta$date[x],
             stringsAsFactors = FALSE)
}, mc.cores = max(1L, detectCores() - 1L, na.rm = TRUE))

# Setting id on the speeches
names(com) <- meta$id

# Binding all rows; rbind() builds the row names from the list names (the
# speech ids), which is what the next step relies on
com <- do.call(rbind, com)

# Fixing id names: dropping everything from the first "." in the row names
com$id <- gsub("\\.(.*?)$", "", rownames(com))
# Keeping only the committee memberships that were active on the day of the
# speech, and then picking the highest ranked membership for each speech.
# com_date is split on " - " into a start and an end date, both parsed as
# day.month.year.
# NOTE(review): both date comparisons are strict, so a speech held on the first
# or the last day of a membership is not matched -- confirm this is intended.
com <- com |>
  mutate(
    date = as.Date(date),
    com_start = as.Date(str_split(com_date, " - ", n = 2, simplify = TRUE)[, 1],
                        format = "%d.%m.%Y"),
    com_end = as.Date(str_split(com_date, " - ", n = 2, simplify = TRUE)[, 2],
                      format = "%d.%m.%Y"),
    com_check = ifelse(com_start < date & com_end > date, "yes", "no")
  ) |>
  # Rows where the check is "no" or missing are dropped
  filter(com_check != "no") |>
  # The levels are listed from highest to lowest rank and then reversed, so
  # that the highest rank gets the largest factor code
  mutate(
    com_role2 = factor(
      com_role,
      levels = rev(c("Stortingets president", "Stortingets visepresident og forsvarskomiteens leder",
                     "Leder", "Forsvarskomiteens leder", "Fung. leder gruppestyret",
                     "Nestleder", "Sekretær", "Første nestleder", "Andre nestleder",
                     "Medlem", "Varamedlem", "Personlig varamedlem"))
    )
  ) |>
  group_by(id) |>
  # which.max() returns the first row holding the largest factor code
  summarize(high_com = com_member[which.max(com_role2)],
            com_role = com_role2[which.max(com_role2)])
# Making dummy sets for all committee memberships: one 0/1 column per
# committee, named by the lowercased committee name with spaces and
# punctuation removed.
# The loop variable `i` is left in the global environment on purpose; the
# script removes it together with the other temporary objects before saving.
for (i in unique(com$high_com)) {
  com[, tolower(gsub(" |[[:punct:]]", "", i))] <- ifelse(com$high_com == i, 1, 0)
}

# Aggregating to speech level: summing every column whose name contains
# "komit" within each speech and role. across() replaces the superseded
# summarize_at(); .groups = "drop_last" states the grouping that
# summarize_at() left behind implicitly.
com <- com |>
  group_by(id, com_role) |>
  summarize(across(matches("komit"), sum), .groups = "drop_last")

# Merging with original data; all.x = TRUE keeps speeches without any match
meta <- merge(x = meta, y = com, by = "id", all.x = TRUE)
# Subsetting the data to use
meta <- meta |>
  filter(!is.na(party_id),
         !is.na(county),
         !is.na(session)) |> # Excludes rows with missing on party_id, county, and session
  filter(speaker_role == "Representant") |> # Subsetting only representatives
  filter(session %in% c("2013-2014", "2014-2015")) |> # Subsetting the two relevant sessions
  # Keeping rows where at least one column matching "komit" is non-missing;
  # if_any() replaces the superseded filter_at()/any_vars()
  filter(if_any(matches("komit"), \(x) !is.na(x)))

# Both meta and the committee data had a com_role column, so merge() suffixed
# them: com_role.x is the column meta had before the merge and com_role.y is
# the one from the committee data. Only the latter is kept.
meta$com_role <- meta$com_role.y
meta$com_role.y <- NULL
meta$com_role.x <- NULL

# Recoding "Varamedlem" to "Medlem"; all other roles are kept as text
meta$com_role <- ifelse(meta$com_role == "Varamedlem", "Medlem", as.character(meta$com_role))
# Reading the CoNLL-files for the remaining data.
# The core count is floored at 1 because detectCores() - 1 is 0 on a
# single-core machine (and NA when the number of cores is unknown).
lemmas_all <- pbmclapply(meta$id, function(x){
  read.conll("../../../gitDebates/talk-of-norway/", x)
}, mc.cores = max(1L, detectCores() - 1L, na.rm = TRUE), ignore.interactive = TRUE)

# From list to data frame
lemmas_all <- bind_rows(lemmas_all)

# Manipulating the text.
# Stopword lemmas are set to NA, and paste() then turns them into the text
# "NA". The patterns "^NA\\:" and "\\:NA$" below are how those pasted missing
# values (in the lemma or the part of speech) are found again.
lemmas_all <- lemmas_all |>
  filter(grepl("[[:punct:]]|\\–|^AV$", lemma) == FALSE) |> # Removing lemmas with punctuation or dashes, and "AV" because of tagger fail (not important as it is a stopword)
  group_by(id, sentence) |> # lead() below must not cross sentence boundaries
  mutate(lemma = tolower(lemma), # Lowercasing all lemma
         lemma = ifelse(lemma %in% stopwords("norwegian"), NA, lemma), # Removing stopwords
         lemma_pos = paste(lemma, part_of_speech, sep = ":"),
         next_lemma_pos = lead(lemma_pos), # Leading lemma for bigram construction
         lemma_pos_bigram = ifelse(grepl("^NA\\:|\\:NA$", lemma_pos) | grepl("^NA\\:|\\:NA$", next_lemma_pos), # Constructing bigrams
                                   NA, paste(lemma_pos, next_lemma_pos)),
         lemma_pos = ifelse(grepl("^NA\\:|\\:NA$", lemma_pos), NA, lemma_pos)) |>
  filter((is.na(lemma_pos) == FALSE | is.na(lemma_pos_bigram) == FALSE))

# The last token of a sentence has no following token, so its bigram was
# pasted as "<lemma_pos> NA". These are set to missing here, and rows left
# without both a unigram and a bigram are dropped.
lemmas_all <- lemmas_all |>
  mutate(lemma_pos_bigram = ifelse(grepl("\\sNA$", lemma_pos_bigram) == FALSE, lemma_pos_bigram, NA)) |>
  filter(is.na(lemma_pos) == FALSE | is.na(lemma_pos_bigram) == FALSE)
# Counting lemma unigrams
lemma_unigrams_pos <- lemmas_all |>
  filter(!is.na(lemma_pos)) |>
  group_by(id) |>
  count(lemma_pos)

# Counting lemma bigrams; the bigram column is renamed to lemma_pos so that
# the two count tables can be stacked
lemma_bigrams_pos <- lemmas_all |>
  filter(!is.na(lemma_pos_bigram)) |>
  group_by(id) |>
  count(lemma_pos_bigram) |>
  rename(lemma_pos = lemma_pos_bigram)

lemma_counts <- bind_rows(lemma_unigrams_pos, lemma_bigrams_pos) |>
  filter(!is.na(lemma_pos)) |> # Removing if lemma is NA
  group_by(id) |> # Grouping by speech
  filter(length(lemma_pos) >= 20) |> # Removing speeches with fewer than 20 distinct unigrams/bigrams
  # Summing the counts from above. The weight has to be given explicitly:
  # dplyr >= 1.0 no longer weights by an existing `n` column, so a bare
  # count(lemma_pos) would count rows and return 1 for every feature.
  count(lemma_pos, wt = n, name = "nn") |>
  cast_dfm(id, lemma_pos, nn) |> # Casting to document-feature matrix
  dfm_trim(min_docfreq = 20, min_termfreq = 5) # Keeping features that occur in at least 20 speeches and
                                               # at least 5 times across all speeches
# Subsetting the meta data to the speeches that remain in the dfm; docnames()
# is the quanteda accessor for the document names, used here instead of
# reading the S4 slot directly
meta <- meta[meta$id %in% docnames(lemma_counts), ]

# Removing redundant objects, so that they are not part of the saved image
rm(lemmas_all, lemma_bigrams_pos, lemma_unigrams_pos, i, com)

# Saving the R-environment
save.image(file = "./data/preprocess/reps_preproc_bigram_pos_com.rda")
features
docs forutsetning:subst forutsette:verb forvente:verb fram:prep
tale211650 1 1 1 1
tale211684 0 0 0 1
tale211693 0 0 0 0
tale211703 0 0 0 0
tale211705 0 0 0 0
tale211714 0 0 0 1
tale211716 0 0 0 0
tale211717 0 0 0 1
tale211718 0 0 0 0
tale211719 0 0 0 0
Arbeidsledighet
Fornybar vs. olje
Grønne skiftet